In [1]:
from plotly.offline import init_notebook_mode, iplot
from sqlalchemy import create_engine
from IPython.display import display
import chart_studio.plotly as py
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import dask.dataframe as dd
from datetime import datetime
import pandas as pd
import hvplot.dask
import os.path
import pathlib
import pyodbc 
import plotly.express as px
import seaborn as sns
In [2]:
file='full.csv'
required_columns=['FL_DATE','DISTANCE','DEP_DELAY','OP_CARRIER','ARR_DELAY','CANCELLED']
In [3]:
dask_df = dd.read_csv(file, usecols=required_columns)
In [4]:
dask_df['DISTANCE'].mean().compute()
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

Out[4]:
787.5098978247175
In [5]:
dask_df.describe()
Out[5]:
Dask DataFrame Structure:
DEP_DELAY ARR_DELAY CANCELLED DISTANCE
npartitions=1
float64 float64 float64 float64
... ... ... ...
Dask Name: describe-numeric, 3427 tasks
In [6]:
dask_df = dask_df.mask(dask_df == 'UA','United Airlines')
dask_df = dask_df.mask(dask_df == 'XE','JSX')
dask_df = dask_df.mask(dask_df == 'AS','Alaska Airlines')
dask_df = dask_df.mask(dask_df == '9E','Endeavor Air')
dask_df = dask_df.mask(dask_df == 'B6','JetBlue Airways')
dask_df = dask_df.mask(dask_df == 'EV','ExpressJet')
dask_df = dask_df.mask(dask_df == 'F9','Frontier Airlines')
dask_df = dask_df.mask(dask_df == 'G4','Allegiant Air')
dask_df = dask_df.mask(dask_df == 'HA','Hawaiian Airlines')
dask_df = dask_df.mask(dask_df == 'MQ','Envoy Air')
dask_df = dask_df.mask(dask_df == 'NK','Spirit Airlines')
dask_df = dask_df.mask(dask_df == 'OH','PSA Airlines')
dask_df = dask_df.mask(dask_df == 'OO','SkyWest Airlines')
dask_df = dask_df.mask(dask_df == 'VX','Virgin America')
dask_df = dask_df.mask(dask_df == 'WN','Southwest Airlines')
dask_df = dask_df.mask(dask_df == 'YV','Mesa Airline')
dask_df = dask_df.mask(dask_df == 'YX','Republic Airways')
dask_df = dask_df.mask(dask_df == 'AA','American Airlines')
dask_df = dask_df.mask(dask_df == 'DL','Delta Airlines')
In [7]:
# Arrival Delays by airlines
series = dask_df.groupby('OP_CARRIER').ARR_DELAY.sum().nlargest(23).compute()
iplot([go.Bar(x=series.index, y=series.values, marker=dict(color='orange'), opacity=1)], filename='Opóźnienia przylotów przez linie lotnicze')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [8]:
# Departue Delay by airlines
series = dask_df.groupby('OP_CARRIER').DEP_DELAY.sum().nlargest(23).compute()
iplot([go.Bar(x=series.index, y=series.values, marker=dict(color='orange'), opacity=1)], filename='Opóźnienia odlotów przez linie lotnicze')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [9]:
# Najpopularniejszy przewoźnik
series = dask_df['OP_CARRIER'].value_counts().compute()
iplot([go.Bar(x=series.index, y=series.values, marker=dict(color='orange'), opacity=1)], filename='Najpopularniejszy przewoźnik')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [10]:
dask_df.nlargest(20,'DISTANCE').compute()
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

Out[10]:
FL_DATE OP_CARRIER DEP_DELAY ARR_DELAY CANCELLED DISTANCE
92231 2019-04-04 NaN -5.0 -9.0 0.0 5095.0
104701 2019-04-05 NaN -2.0 12.0 0.0 5095.0
104702 2019-04-05 NaN 19.0 -2.0 0.0 5095.0
128300 2019-04-06 NaN -5.0 26.0 0.0 5095.0
128301 2019-04-06 NaN -3.0 -43.0 0.0 5095.0
147072 2019-04-07 NaN 2.0 35.0 0.0 5095.0
147073 2019-04-07 NaN 5.0 -10.0 0.0 5095.0
175210 2019-04-08 NaN 13.0 48.0 0.0 5095.0
175211 2019-04-08 NaN 3.0 -41.0 0.0 5095.0
184069 2019-04-09 NaN 5.0 3.0 0.0 5095.0
226229 2019-04-11 NaN 28.0 50.0 0.0 5095.0
259306 2019-04-12 NaN 37.0 6.0 0.0 5095.0
259307 2019-04-12 NaN 5.0 12.0 0.0 5095.0
269696 2019-04-13 NaN -1.0 -13.0 0.0 5095.0
269697 2019-04-13 NaN 0.0 -13.0 0.0 5095.0
290432 2019-04-14 NaN -1.0 -30.0 0.0 5095.0
290433 2019-04-14 NaN 2.0 30.0 0.0 5095.0
305004 2019-04-15 NaN 2.0 -20.0 0.0 5095.0
305005 2019-04-15 NaN 47.0 35.0 0.0 5095.0
338635 2019-04-16 NaN 5.0 23.0 0.0 5095.0
In [11]:
dask_df.nsmallest(20,'DISTANCE').compute()
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

Out[11]:
FL_DATE OP_CARRIER DEP_DELAY ARR_DELAY CANCELLED DISTANCE
97982 2009-02-03 American Airlines 71.0 NaN 1.0 11.0
68275 2009-07-07 PSA Airlines 65.0 35.0 0.0 11.0
369199 2013-07-27 US NaN NaN 1.0 17.0
360261 2009-01-21 American Airlines -5.0 -5.0 0.0 21.0
9674 2015-04-20 American Airlines 112.0 NaN 1.0 21.0
58063 2012-12-10 American Airlines 1170.0 1182.0 0.0 24.0
7862 2014-03-12 ExpressJet NaN NaN 1.0 24.0
137492 2009-09-26 SkyWest Airlines NaN NaN 1.0 25.0
180321 2009-09-26 SkyWest Airlines NaN NaN 1.0 25.0
86257 2009-11-18 SkyWest Airlines NaN NaN 1.0 25.0
444711 2016-11-28 SkyWest Airlines 232.0 NaN 0.0 25.0
420395 2014-02-07 ExpressJet 38.0 NaN 0.0 26.0
214741 2016-08-14 ExpressJet 7.0 56.0 0.0 28.0
488225 2020-05-08 NaN 22.0 35.0 0.0 29.0
492621 2020-05-09 NaN 0.0 12.0 0.0 29.0
497843 2020-05-10 NaN -3.0 7.0 0.0 29.0
2381 2020-05-11 NaN -9.0 NaN 0.0 29.0
20746 2020-05-14 NaN -1.0 1.0 0.0 29.0
23281 2020-05-15 NaN -3.0 8.0 0.0 29.0
32591 2020-05-16 NaN 16.0 37.0 0.0 29.0
In [12]:
# łączna pokonana odległość przez przewoźnika
series = dask_df.groupby('OP_CARRIER').DISTANCE.sum().nlargest(23).compute()
iplot([go.Bar(x=series.index, y=series.values, marker=dict(color='orange'), opacity=1)], filename='łączna pokonana odległość przez przewoźnika')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [13]:
# odlowania 2009-2020
series = dask_df.groupby('FL_DATE').CANCELLED.sum().compute()
iplot([go.Scatter(x=series.index, y=series.values,mode='lines', opacity=1)], filename='Opóźnienia odlotów przez linie lotnicze')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [14]:
# odlowania 2009-2020
series = dask_df.groupby('FL_DATE').DEP_DELAY.sum().compute()
iplot([go.Scatter(x=series.index, y=series.values, mode='lines', opacity=1)], filename='Opóźnienia odlotów przez linie lotnicze')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [15]:
# opoznienie odlotu 2009-2020
series = dask_df.groupby('FL_DATE').ARR_DELAY.sum().compute()
iplot([go.Scatter(x=series.index, y=series.values,mode='lines', opacity=1)], filename='Opóźnienia przylotu przez linie lotnicze')
C:\ProgramData\Anaconda3\lib\site-packages\dask\core.py:121: DtypeWarning:

Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.

In [16]:
file='20192020.csv'
required_columns=['FL_DATE','DISTANCE','DEP_DELAY','ARR_DELAY','CANCELLED']
dask_df = dd.read_csv(file, usecols=required_columns)
In [17]:
# odlowania 2020
series = dask_df.groupby('FL_DATE').CANCELLED.sum().compute()
iplot([go.Scatter(x=series.index, y=series.values,mode='lines', opacity=1)], filename='Opóźnienia odlotów przez linie lotnicze')
In [18]:
# odlowania 2019-2020
series = dask_df.groupby('FL_DATE').DEP_DELAY.sum().compute()
iplot([go.Scatter(x=series.index, y=series.values, mode='lines', opacity=1)], filename='Opóźnienia odlotów przez linie lotnicze')
In [19]:
# opoznienie odlotu 2019-2020
series = dask_df.groupby('FL_DATE').ARR_DELAY.sum().compute()
iplot([go.Scatter(x=series.index, y=series.values, mode='lines', opacity=1)], filename='Opóźnienia przylotu przez linie lotnicze')
In [ ]: